#!/bin/bash
# Set the Hadoop environment variable (run the HDFS commands as the hdfs user)
export HADOOP_USER_NAME=hdfs

# Get the files of all streaming/batch (Flink) applications
hadoop fs -ls /user/hdfs/.flink/|awk '{print $NF}'|awk -F"/" '{print $NF}'|grep application > yarn_all.txt
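# ($NF of the ls output is the full HDFS path; splitting it on "/" keeps only the
#  application_<id> directory name, and grep drops the "Found N items" header)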
# Get the streaming/batch applications that are currently running
yarn application -list -appStates RUNNING 2>/dev/null|awk '{print $1}'|grep application > yarn_running.txt
# Take the difference between yarn_all.txt and yarn_running.txt; the difference is the
# leftover files of streaming/batch applications that are no longer running, deleted below
sort yarn_all.txt yarn_running.txt yarn_running.txt | uniq -u > yarn_cha.txt
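# yarn_running.txt is listed twice so every running application ID occurs at least
# twice in the sorted stream; `uniq -u` keeps only lines that appear exactly once,
# i.e. the application IDs present only in yarn_all.txt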
# Read the difference file into an array
log_file=($(cat yarn_cha.txt))
for app_id in "${log_file[@]}"
do
	# Delete the program files of the application that is no longer running
	hadoop fs -rm -r -skipTrash "/user/hdfs/.flink/${app_id}"
	# Delete the log files of the application that is no longer running
	hadoop fs -rm -r -skipTrash "/tmp/logs/hdfs/logs/${app_id}"
done
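
## A defensive variant (a sketch, not part of the original logic): if the
## `yarn application -list` call above fails, yarn_running.txt ends up empty and
## every directory under /user/hdfs/.flink would be treated as stale. A guard like
## the following, placed before the deletion loop, avoids deleting live jobs:
#if [ ! -s yarn_running.txt ]; then
#	echo "yarn_running.txt is empty; aborting cleanup" >&2
#	exit 1
#fi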


##Alternatively, the file set difference can be computed with Python
#s1 = set(open('yarn_running.txt', 'r').readlines())
#s2 = set(open('yarn_all.txt', 'r').readlines())
#ff = open('yarn_cha.txt', 'w')
#all_difference = s2.difference(s1)
#
#for a in all_difference:
#	if "\n" not in a:
#		a = a + "\n"
#	ff.write(a)
#ff.close()